import os
import spacy
from tqdm import tqdm
import pandas as pd
import numpy as np
import ast
import plotly.express as px
from sklearn.manifold import TSNE
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from tensorflow import keras
import transformers
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from transformers import TFAutoModel, AutoTokenizer
from transformers import AutoConfig
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Register tqdm with pandas so Series/DataFrame .progress_apply shows a bar.
tqdm.pandas()
# Directory where intermediate embedding CSVs are cached and reloaded from.
embedding_path = r"C:\Holmusk\analysis"
def load_embedding(file_name="bert_embedding.csv", path=None):
    """Load a previously cached embedding matrix from CSV.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside *path*.
    path : str or None
        Directory containing the file; defaults to the module-level
        ``embedding_path`` (kept as the default for backward compatibility).

    Returns
    -------
    pandas.DataFrame
        The embedding matrix with the stray index column removed.
    """
    if path is None:
        path = embedding_path
    emb = pd.read_csv(os.path.join(path, file_name))
    # DataFrame.to_csv() writes the index as an unnamed first column; drop it.
    # errors='ignore' keeps this robust for files saved without an index.
    emb.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
    return emb
# Load the spaCy pipelines: small (96-d vectors) and large (300-d vectors).
nlp_sm = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")
#data = pd.read_csv(r"C:\Holmusk\MedicalNotesNLPChallenge\cleaned_chunked_clinic_notes.csv")
# Use raw strings for Windows paths: in the original, escapes like "\C" and
# "\M" only worked because they are not recognized escape sequences.
data = pd.read_csv(r"C:\Holmusk\MedicalNotesNLPChallenge\ClinNotes.csv")
medical_terms = pd.read_csv(r"C:\Holmusk\MedicalNotesNLPChallenge\MedicalConcepts.csv")
# Vectorized lowercasing (also NaN-safe, unlike an element-wise lambda).
medical_terms['Term1'] = medical_terms['Term1'].str.lower()
medical_terms['Term2'] = medical_terms['Term2'].str.lower()
# Lowercasing can create duplicate pairs; keep the first occurrence.
medical_terms.drop_duplicates(keep='first', inplace=True)
medical_terms.shape
def calculate_similarity(row):
    """Cosine similarity between the two embedding vectors in *row*.

    Parameters
    ----------
    row : mapping with keys 'term1' and 'term2'
        Each value is a 1-D embedding vector (array-like).

    Returns
    -------
    float
        Cosine similarity; 0.0 when either vector has zero norm (matching
        sklearn's cosine_similarity behavior for zero vectors).
    """
    a = np.asarray(row['term1'], dtype=float)
    b = np.asarray(row['term2'], dtype=float)
    # Direct numpy computation avoids building 2-D arrays and calling the
    # sklearn machinery once per DataFrame row.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)
def getEmbedding(row, nlp):
    """Return the pipeline's document vector for the given text *row*."""
    return nlp(row).vector
def plotKmeansClusteredEmbeddings(embeddings, model, n_clusters=3):
    """Cluster embeddings with K-means and plot them in 2-D t-SNE space.

    Parameters
    ----------
    embeddings : 2-D array-like, shape (n_samples, n_features)
        The embedding vectors to cluster and visualize.
    model : str
        Model name, used only in the plot title.
    n_clusters : int, optional
        Number of K-means clusters (default 3, the previously hard-coded value).
    """
    # Cluster in the original high-dimensional space ...
    cluster_labels = KMeans(n_clusters=n_clusters).fit_predict(embeddings)
    # ... but reduce to 2-D purely for visualization.
    reduced_embeddings = TSNE(n_components=2).fit_transform(embeddings)
    plt.figure(figsize=(12, 6))
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=cluster_labels)
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.title(f'K-means Clustering of {model} Embeddings')
    plt.show()
def plotEmbeddings(embeddings, categories, model):
    """Project embeddings to 2-D with t-SNE and show an interactive scatter.

    Points are colored by *categories*; the figure is also saved as an HTML
    file named after the model and the embedding dimensionality.
    """
    dim = len(embeddings[0])
    frame = pd.DataFrame(embeddings)
    frame["Data_Point"] = categories
    # Apply t-SNE (fixed seed for a reproducible layout).
    projector = TSNE(n_components=2, random_state=42)
    projected = projector.fit_transform(frame.drop(["Data_Point"], axis=1))
    frame["TSNE_X"] = projected[:, 0]
    frame["TSNE_Y"] = projected[:, 1]
    fig = px.scatter(frame, x="TSNE_X", y="TSNE_Y", hover_data={"Data_Point": True}, color="Data_Point")
    fig.update_traces(hovertemplate="Data Point: %{customdata[0]}")
    fig.update_layout(title=f"Embeddings Visualization with {model} Embedding size as: {dim}")
    # Show the plot, then persist it as a standalone HTML file.
    fig.show()
    fig.write_html(f"{model}_{dim}.html")
# Embed every clinical note with the small spaCy pipeline (96-d vectors),
# cache to CSV, reload as a plain numeric matrix, then sanity-check the
# embedding space with a K-means + t-SNE plot.
res_sm = data['notes'].apply(getEmbedding, args=(nlp_sm,))
pd.DataFrame(list(res_sm)).to_csv(os.path.join(embedding_path,'row_word2vec_96_embedding.csv'))
res_sm = np.array(load_embedding(file_name = "row_word2vec_96_embedding.csv"))
plotKmeansClusteredEmbeddings(list(res_sm), 'Word2Vec 96')
# Note: the above approach to verifying embedding quality is not well suited
# to the kind of data we are dealing with.
# Re-plot the small-model embeddings, this time colored by note category.
res_sm = np.array(load_embedding(file_name = "row_word2vec_96_embedding.csv"))
plotEmbeddings(list(res_sm), data['category'].values, 'word2vec')
# Intrinsic evaluation: cosine similarity of known-related medical term pairs
# under the small spaCy vectors (a higher mean suggests a better embedding).
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(getEmbedding, args=(nlp_sm,))
res2 = medical_terms["Term2"][:top].apply(getEmbedding, args=(nlp_sm,))
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
w2v_sm = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# Repeat embedding + evaluation with the large spaCy pipeline (300-d vectors).
res_lg = data['notes'].apply(getEmbedding, args=(nlp_lg,))
pd.DataFrame(list(res_lg)).to_csv(os.path.join(embedding_path,'raw_word2vec_300_embedding.csv'))
res_lg = np.array(load_embedding(file_name = "raw_word2vec_300_embedding.csv"))
plotEmbeddings(list(res_lg), data['category'].values, 'word2vec')
# Same intrinsic term-pair evaluation as for the small model.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(getEmbedding, args=(nlp_lg,))
res2 = medical_terms["Term2"][:top].apply(getEmbedding, args=(nlp_lg,))
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
w2v_lg = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# Load the pre-trained ELMo model (contextual embeddings) from TF Hub.
elmo = hub.load("https://tfhub.dev/google/elmo/3")
def get_elmo_embedding(sentences):
    """Mean-pooled ELMo embedding for a single input string.

    Runs the module-level ``elmo`` signature on a one-element batch and
    averages the per-token vectors into one sentence vector (numpy array).
    """
    token_vectors = elmo.signatures["default"](tf.constant([sentences]))["elmo"]
    pooled = tf.reduce_mean(token_vectors, axis=1)
    return pooled.numpy()[0]
# Embed all notes with ELMo, cache to CSV, visualize by category, then run
# the same term-pair evaluation as for the spaCy models.
res_elmo = data['notes'].apply(get_elmo_embedding)
pd.DataFrame(list(res_elmo)).to_csv(os.path.join(embedding_path,'raw_elmo_pretrained_embeddings.csv'))
res_elmo = np.array(load_embedding(file_name = "raw_elmo_pretrained_embeddings.csv"))
plotEmbeddings(list(res_elmo), data['category'].values, 'Elmo')
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_elmo_embedding)
res2 = medical_terms["Term2"][:top].apply(get_elmo_embedding)
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
# NOTE: unlike the other models, this score is not stored in a named
# variable, so ELMo does not appear in the comparison charts below.
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
def get_bert_embedding(sentence, model, tokenizer):
    """Masked mean-pooled last-hidden-state embedding for *sentence*.

    Tokenizes one sentence (truncated to 512 tokens), runs the TF
    transformer, and averages the hidden states over real (non-padding)
    token positions. Returns a (1, hidden_size) tensor.
    """
    enc = tokenizer.encode_plus(sentence, padding=True, truncation=True, max_length=512, return_tensors="tf")
    out = model(enc["input_ids"], attention_mask=enc["attention_mask"])
    hidden = out.last_hidden_state
    # Broadcastable (batch, seq, 1) mask: 1.0 at real tokens, 0.0 at padding.
    valid = tf.cast(tf.expand_dims(enc["attention_mask"], axis=-1), tf.float32)
    summed = tf.reduce_sum(hidden * valid, axis=1)
    counts = tf.reduce_sum(valid, axis=1)
    return summed / counts
# Baseline transformer: general-domain BERT.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)
res_bert = data['notes'].apply(get_bert_embedding, args=(model, tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
res_bert1 = [res[0].numpy() for res in res_bert]
pd.DataFrame(res_bert1).to_csv(os.path.join(embedding_path, 'raw_bert_embedding.csv'))
res_bert1 = np.array(load_embedding(file_name="raw_bert_embedding.csv"))
plotEmbeddings(list(res_bert1), data['category'].values, 'BERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
# BUG FIX: get_bert_embedding requires (model, tokenizer); the original
# calls omitted args=(...) and would raise a TypeError.
res1 = medical_terms["Term1"][:top].apply(get_bert_embedding, args=(model, tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_bert_embedding, args=(model, tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1': res1, 'term2': res2}).apply(calculate_similarity, axis=1)
bert = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# BioBERT: BERT pre-trained on biomedical literature.
bio_model_name = "dmis-lab/biobert-v1.1"
bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_name)
# from_pt=True: the published checkpoint is PyTorch; convert to TF weights.
bio_model = TFAutoModel.from_pretrained(bio_model_name, from_pt=True)
res_bio_bert = data['notes'].apply(get_bert_embedding, args=(bio_model,bio_tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
res_bio_bert1 = [res[0].numpy() for res in res_bio_bert]
pd.DataFrame(res_bio_bert1).to_csv(os.path.join(embedding_path,'raw_bio_bert_embedding.csv'))
res_bio_bert1 = np.array(load_embedding(file_name = "raw_bio_bert_embedding.csv"))
plotEmbeddings(list(res_bio_bert1), data['category'].values, 'BioBERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_bert_embedding, args=(bio_model,bio_tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_bert_embedding, args=(bio_model,bio_tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
bio_bert = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# ClinicalBERT: BERT pre-trained on clinical notes (MIMIC).
clinic_model_name = "emilyalsentzer/Bio_ClinicalBERT"
clinic_tokenizer = AutoTokenizer.from_pretrained(clinic_model_name)
clinic_model = TFAutoModel.from_pretrained(clinic_model_name)
res_clinic_bert = data['notes'].apply(get_bert_embedding, args=(clinic_model, clinic_tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
res_clinic_bert1 = [res[0].numpy() for res in res_clinic_bert]
pd.DataFrame(res_clinic_bert1).to_csv(os.path.join(embedding_path,'raw_clinic_bert_embedding.csv'))
res_clinic_bert1 = np.array(load_embedding(file_name = "raw_clinic_bert_embedding.csv"))
plotEmbeddings(list(res_clinic_bert1), data['category'].values, 'ClinicalBERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_bert_embedding, args=(clinic_model, clinic_tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_bert_embedding, args=(clinic_model, clinic_tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
clinical_bert = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# BlueBERT: BERT pre-trained on PubMed abstracts and clinical notes.
blue_model_name = "bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12"
blue_tokenizer = AutoTokenizer.from_pretrained(blue_model_name)
# from_pt=True: the published checkpoint is PyTorch; convert to TF weights.
blue_model = TFAutoModel.from_pretrained(blue_model_name, from_pt=True)
res_blue_bert = data['notes'].apply(get_bert_embedding, args=(blue_model,blue_tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
res_blue_bert1 = [res[0].numpy() for res in res_blue_bert]
pd.DataFrame(res_blue_bert1).to_csv(os.path.join(embedding_path,'raw_blue_bert_embedding.csv'))
res_blue_bert1 = np.array(load_embedding(file_name = "raw_blue_bert_embedding.csv"))
plotEmbeddings(list(res_blue_bert1), data['category'].values, 'BlueBERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_bert_embedding, args=(blue_model,blue_tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_bert_embedding, args=(blue_model,blue_tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
blue_bert = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# Aggregate the mean term-pair similarity per model (as a percentage),
# persist the scores to CSV, and render an interactive comparison bar chart.
chars = ['word2vec_small', 'word2vec_large', 'bert', 'bio_bert', 'clinical_bert', 'blue_bert']
numbers = [round(np.mean(w2v_sm) * 100, 2), round(np.mean(w2v_lg) * 100, 2), round(np.mean(bert) * 100, 2),
round(np.mean(bio_bert) * 100, 2), round(np.mean(clinical_bert) * 100, 2), round(np.mean(blue_bert) * 100, 2)]
pd.DataFrame({"chart": chars, "similarity_score" : numbers}).to_csv("similarity_score.csv")
# Round-trip through the CSV so the chart reflects exactly what was saved.
similarity_score = pd.read_csv("similarity_score.csv")
chars = similarity_score['chart']
numbers = similarity_score['similarity_score']
import plotly.graph_objects as go
import numpy as np
fig = go.Figure(data=go.Bar(x=chars, y=numbers, text=numbers, textposition='auto', hoverinfo='none'))
fig.update_layout(xaxis_title='Models', yaxis_title='Cosine Similarity (%)', title='Model vs Cosine Similarity on Medical Keyword Pairs')
fig.show()
def get_embedding_from_finetuned_bert(sentence, tuned_model, tokenizer):
    """Masked mean-pooled embedding from a fine-tuned Keras BERT model.

    Unlike get_bert_embedding, the saved Keras model takes a two-element
    input list and returns a dict, so the call/indexing differ. Returns a
    (1, hidden_size) tensor.
    """
    enc = tokenizer(sentence, truncation=True, padding=True, max_length=512, return_tensors="tf")
    outputs = tuned_model([enc['input_ids'], enc['attention_mask']])
    hidden = outputs['last_hidden_state']
    # Zero out padding positions, then average over real tokens only.
    valid = tf.cast(tf.expand_dims(enc['attention_mask'], axis=-1), tf.float32)
    return tf.reduce_sum(hidden * valid, axis=1) / tf.reduce_sum(valid, axis=1)
# ClinicalBERT fine-tuned on the clinic notes themselves (saved Keras .h5).
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# custom_objects lets Keras deserialize the embedded TFBertModel layer.
tuned_model = keras.models.load_model(r'C:\Holmusk\model\clinical_bert_fine_tuned_for_notes.h5',
custom_objects={"TFBertModel": transformers.TFBertModel})
tuned_clinic_bert = data['notes'].apply(get_embedding_from_finetuned_bert, args=(tuned_model, tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
tuned_clinic_bert1 = [res[0].numpy() for res in tuned_clinic_bert]
pd.DataFrame(tuned_clinic_bert1).to_csv(os.path.join(embedding_path,'raw_tuned_clinic_notes_bert_embedding_new.csv'))
tuned_clinic_bert1 = np.array(load_embedding(file_name = "raw_tuned_clinic_notes_bert_embedding_new.csv"))
plotEmbeddings(list(tuned_clinic_bert1), data['category'].values, 'TunedClinicalBERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_embedding_from_finetuned_bert, args=(tuned_model, tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_embedding_from_finetuned_bert, args=(tuned_model, tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1':res1, 'term2': res2}).apply(calculate_similarity, axis=1)
finetuned_clinical_bert = res  # kept for the model-comparison bar chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# ClinicalBERT fine-tuned on medical keyword pairs (saved Keras .h5).
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# custom_objects lets Keras deserialize the embedded TFBertModel layer.
keyword_tuned_model = keras.models.load_model(r'C:\Holmusk\model\clinical_bert_fine_tuned_for_medic_keywords.h5',
                                              custom_objects={"TFBertModel": transformers.TFBertModel})
keyword_tuned_clinic_bert = data['notes'].apply(get_embedding_from_finetuned_bert, args=(keyword_tuned_model, tokenizer))
# Each apply result is a (1, hidden) tensor; take row 0 as a numpy vector.
keyword_tuned_clinic_bert1 = [res[0].numpy() for res in keyword_tuned_clinic_bert]
# BUG FIX: the original wrote tuned_clinic_bert1 (the notes-tuned embeddings
# from the previous section) into this CSV; persist the keyword-tuned
# embeddings instead, and stop clobbering tuned_clinic_bert1 on reload.
pd.DataFrame(keyword_tuned_clinic_bert1).to_csv(os.path.join(embedding_path, 'raw_keyword_tuned_bert_embedding_new.csv'))
keyword_tuned_clinic_bert1 = np.array(load_embedding(file_name="raw_keyword_tuned_bert_embedding_new.csv"))
plotEmbeddings(list(keyword_tuned_clinic_bert1), data['category'].values, 'KeywordTunedClinicalBERT')
# Intrinsic evaluation on related medical term pairs.
top = len(medical_terms)
res1 = medical_terms["Term1"][:top].apply(get_embedding_from_finetuned_bert, args=(keyword_tuned_model, tokenizer))
res2 = medical_terms["Term2"][:top].apply(get_embedding_from_finetuned_bert, args=(keyword_tuned_model, tokenizer))
res1 = [res[0].numpy() for res in res1]
res2 = [res[0].numpy() for res in res2]
res = pd.DataFrame({'term1': res1, 'term2': res2}).apply(calculate_similarity, axis=1)
finetuned_clinical_bert_for_keywords = res  # kept for the final comparison chart
print("Total number of pairs = ", top)
print("Total cosine_similarity = ", sum(res))
print("mean cosine_similarity = ", np.mean(res))
# NOTE(review): "raw_tuned_keyword_clinic_notes_bert_embedding_new.csv" is not
# written anywhere in this script — presumably produced by a separate run of a
# model fine-tuned on both notes AND keywords; confirm the file exists before
# executing this cell.
tuned_keyword_clinic_bert1 = np.array(load_embedding(file_name = "raw_tuned_keyword_clinic_notes_bert_embedding_new.csv"))
plotEmbeddings(list(tuned_keyword_clinic_bert1), data['category'].values, 'NotesAndKeywordTunedClinicalBERT')
import plotly.graph_objects as go
import numpy as np
# Final comparison: all pre-trained models plus both fine-tuned variants.
chars = ['word2vec_small', 'word2vec_large', 'bert', 'bio_bert', 'clinical_bert', 'blue_bert','clinic_notes_finetuned_bert',
"keyword_based_finetuned_bert" ]
numbers = [round(np.mean(w2v_sm) * 100, 2), round(np.mean(w2v_lg) * 100, 2), round(np.mean(bert) * 100, 2),
round(np.mean(bio_bert) * 100, 2), round(np.mean(clinical_bert) * 100, 2), round(np.mean(blue_bert) * 100, 2),
round(np.mean(finetuned_clinical_bert) * 100, 2), round(np.mean(finetuned_clinical_bert_for_keywords) * 100, 2)]
# Create a bar chart using Plotly
fig = go.Figure(data=go.Bar(x=chars, y=numbers, text=numbers, textposition='auto', hoverinfo='none'))
# Update layout with labels and title
fig.update_layout(xaxis_title='Models', yaxis_title='Cosine Similarity (%)', title='Model vs Cosine Similarity on Medical Keyword Pairs')
# Display the chart
fig.show()
# Persist the final scores alongside the chart.
pd.DataFrame({"chart": chars, "similarity_score" : numbers}).to_csv("all_model_similarity.csv")